******************************************************************************************************************************************************
*This do file extracts, cleans and appends the 2006-2010 GSO datasets.
*	a.	Input files:
*		i.	dn2006.dta – dn2010.dta
*	b.	Output files:
*		i.	GSO_CLEAN.dta									   																	    *
******************************************************************************************************************************************************

* Session setup: start from an empty session and disable the -more- pause so the script runs unattended.
clear all
set more off
* NOTE(review): set mem is only required by older Stata releases (Stata 12 and later manage memory automatically and ignore it) - confirm the target version before removing.
set mem 600m
set maxvar 3000


*SET DIRECTORY HERE*
* Root folders for the raw inputs and for the cleaned output. Both values end in a slash and are later joined with a further slash when file names are built.
global dir_data_original /Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/original raw data/
global dir_data_coded /Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/intermediary data/

* NOTE(review): second clear all; nothing has been loaded since the first one, so this looks redundant.
clear all

* keepvars: variables (and wildcard families) retained from every yearly file.
* NOTE(review): tinh is listed twice and kqkd7 is already matched by the kqkd wildcard.
local keepvars  "year tinh tn1* tn2* ts* kqkd* ld11 ld13 ncap4 tinh lhdn madn macs nganh_kd nganh_cu capso von_nn kqkd7"
* dropvars: members of the wildcard families above that are removed again, one at a time, after the keep.
* The list spans several lines, so the command delimiter is switched to a semicolon while it is defined.
#delimit ;
local dropvars  "tn21 tn22 tn101 tn102 tn111 tn112 tn121 tn122 ts3* ts4* ts5* ts6* ts7* ts9* ts13* ts14* ts15* ts16* ts17* ts18* ts19* ts2*  ts101 ts102 ts111 ts112 ts121 ts122
 								kqkd4 kqkd5 kqkd6 kqkdn* kqkd9 kqkd10 kqkd11 kqkd12 kqkd13 kqkd14 kqkd15 kqkd16 kqkd17 kqkd18 kqkd19  kqkd81
 								kqkd91 kqkd3 kqkd96 tn71 ts811 ts812";
#delim cr

* Pool the yearly GSO files (2010 down to 2006) into a single dataset.
* Each wave is loaded, tagged with its year, reduced to the variables of
* interest, and then the waves pooled so far are appended to it. After the
* loop the full pooled dataset is both in memory and in the tempfile.
tempfile pooled

* Flag: 0 until the first wave has been saved. Used to skip the append on the
* first pass explicitly, instead of wrapping append in capture. A captured
* append would also hide genuine failures (for example a string/numeric type
* clash between waves), and the save below would then overwrite the pooled
* file with the current wave only, silently dropping all earlier waves.
local have_pooled = 0

foreach y in 2010 2009 2008 2007 2006 {
	use "${dir_data_original}/GSO/dn`y'.dta", clear
	gen year =`y'
	* Create nganh_cu as missing when the wave does not already contain it
	* (capture swallows the error raised when the variable already exists).
	cap gen nganh_cu=.
	keep `keepvars'
	* Best effort: not every listed variable exists in every wave.
	foreach v in `dropvars' {
		cap drop `v'
	}
	drop if kqkdc==. & ld13 ==.  /* firm-level observations where number of employees and revenues are both missing */
	* Append without capture so that a real failure stops the script.
	if `have_pooled' {
		append using `pooled'
	}
	save `pooled', replace
	local have_pooled = 1
}

* Harmonise variables that are named differently across waves: for 2009 the
* values of tn11 and tn12 are copied into tn1 and tn2, then the wave-specific
* columns are removed. All three steps are best effort (capture).
capture replace tn1 = tn11 if year == 2009
capture replace tn2 = tn12 if year == 2009
capture drop tn11 tn12

* Count observations sharing the same year, firm code and province, show the
* distribution, and remove every observation whose key is not unique
* (all copies are dropped, none is kept).
bysort year madn tinh: generate dup = _N
tabulate dup
drop if dup > 1
drop dup

* ncap4 is a string made of a sector letter followed by a numeric code.
* Splitting on the letters A-V leaves the numeric part in the second piece,
* which is kept as isic.
split ncap4, generate(isic) parse(A B C D E F G H I J K L M N O P Q R S T U V ) limit(2)
rename isic2 isic

* Numeric province code built from tinh; the source column is then removed.
destring tinh, generate(province)
label variable province "GSO Province Code"
drop tinh

* Firm identifier from province, firm code and macs; the raw codes are then removed.
egen firm_id = group(province madn macs)
drop madn macs


****** END OF CLEANUP ****** 

* The numeric sector code is treated as the 4-digit ISIC Rev. 4 code.
rename isic isic_rev4_4digit

* Pad three-character codes with a leading zero.
replace isic_rev4_4digit = "0" + isic_rev4_4digit if length(isic_rev4_4digit) == 3

* The 2-digit code is the first two characters of the 4-digit code.
generate isic_rev4_2digit = substr(isic_rev4_4digit, 1, 2)
label variable isic_rev4_2digit "ISIC Revision 4 Digit Code (2 Digits)"
label variable isic_rev4_4digit "ISIC Revision 4 Digit Code (4 Digits)"

* Single-character 2-digit codes also get a leading zero.
replace isic_rev4_2digit = "0" + isic_rev4_2digit if length(isic_rev4_2digit) == 1

* Observations without any sector code are removed.
drop if isic_rev4_2digit == ""

* Show the distribution of 2-digit codes, then blank out 4-digit codes that
* are not exactly four characters long.
tabulate isic_rev4_2digit, missing
replace isic_rev4_4digit = "" if length(isic_rev4_4digit) != 4

* Negative values of these four measures are set to missing.
foreach measure of varlist kqkd1 kqkdc ld13 ts12 {
	replace `measure' = . if `measure' < 0
}

* Readable names for the outcome variables.
rename kqkd1 revenue
rename kqkdc revenue_main
rename ld13 employ
rename ts12 assets
rename kqkd7 profit

* Attach the broad industry code from the ISIC crosswalk, matching on the
* 2-digit code, and keep only observations found in both files.
* The tostring call is retained from the original script; the key was built
* with substr and is therefore already a string at this point.
sort isic_rev4_2digit
tostring isic_rev4_2digit, replace
merge m:1 isic_rev4_2digit using "${dir_data_original}/Crosswalks/isic_alphabet_crosswalk.dta"
keep if _merge == 3
drop _merge

* Reduce the dataset to the analysis variables.
* NOTE(review): broad is not created earlier in this script, so it is presumably supplied by the crosswalk - confirm.
keep isic_rev4_2digit broad province year employ profit revenue revenue_main assets lhdn

* Log employment plus short aliases: r for province, t for year.
generate lnemploy = ln(employ)
generate r = province
generate t = year

* Industry index j: observations without a broad code are removed, then the
* string code is encoded as a labelled numeric variable.
drop if broad == ""
encode broad, generate(j)

* Group identifiers for each combination of province, industry and year.
egen rjt = group(r j t)
egen rj = group(r j)
egen rt = group(r t)
egen jt = group(j t)

save "${dir_data_coded}/GSO_CLEAN.dta", replace



